In [87]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# https://github.com/Gracee2024/5622-final
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the read-only Kaggle input directory and print every file path,
# so the reader can see exactly which datasets are available.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv /kaggle/input/playground-series-s4e1/sample_submission.csv /kaggle/input/playground-series-s4e1/train.csv /kaggle/input/playground-series-s4e1/test.csv
This project is about customer churn — the percentage of customers who stop buying a business's products or services over a given period of time, also known as customer attrition, customer turnover, or customer defection. This is a typical binary classification problem: we will predict customer churn in the banking industry using machine learning.
Objective: The loss of clients or customers holds significant importance for banks. Forecasting potential churn for individual customers can assist in implementing strategies to prevent their loss. The goal is to predict whether a customer continues with their account or closes it.
Table of contents
Table of contents
Table of contents¶
Dataset Description
Dataset Description
1. Dataset Description¶
The dataset for this competition (both train and test) was generated from a deep learning model trained on the Bank Customer Churn Prediction dataset. Feature distributions are close to, but not exactly the same, as the original.
- Customer ID: A unique identifier for each customer
- Surname: The customer's surname or last name
- Credit Score: A numerical value representing the customer's credit score
- Geography: The country where the customer resides (France, Spain or Germany)
- Gender: The customer's gender (Male or Female)
- Age: The customer's age
- Tenure: The number of years the customer has been with the bank
- Balance: The customer's account balance
- NumOfProducts: The number of bank products the customer uses (e.g., savings account, credit card)
- HasCrCard: Whether the customer has a credit card (1 = yes, 0 = no)
- IsActiveMember: Whether the customer is an active member (1 = yes, 0 = no)
- EstimatedSalary: The estimated salary of the customer
- Exited: Whether the customer has churned (1 = yes, 0 = no)
Data Source By:
- Reade, W., & Chow, A. (2024). Binary Classification with a Bank Churn Dataset. Kaggle. Retrieved from https://kaggle.com/competitions/playground-series-s4e1
In [32]:
# Load the competition splits from the read-only Kaggle input directory.
# train.csv carries the Exited target column; test.csv does not.
train_df = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')
In [33]:
# Preview the first five training rows (n=5 is head's default, made explicit).
train_df.head(n=5)
Out[33]:
| id | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 15674932 | Okwudilichukwu | 668 | France | Male | 33.0 | 3 | 0.00 | 2 | 1.0 | 0.0 | 181449.97 | 0 |
| 1 | 1 | 15749177 | Okwudiliolisa | 627 | France | Male | 33.0 | 1 | 0.00 | 2 | 1.0 | 1.0 | 49503.50 | 0 |
| 2 | 2 | 15694510 | Hsueh | 678 | France | Male | 40.0 | 10 | 0.00 | 2 | 1.0 | 0.0 | 184866.69 | 0 |
| 3 | 3 | 15741417 | Kao | 581 | France | Male | 34.0 | 2 | 148882.54 | 1 | 1.0 | 1.0 | 84560.88 | 0 |
| 4 | 4 | 15766172 | Chiemenam | 716 | Spain | Male | 33.0 | 5 | 0.00 | 2 | 1.0 | 1.0 | 15068.83 | 0 |
In [34]:
# Preview the first five test rows (n=5 is head's default, made explicit).
test_df.head(n=5)
Out[34]:
| id | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 165034 | 15773898 | Lucchese | 586 | France | Female | 23.0 | 2 | 0.00 | 2 | 0.0 | 1.0 | 160976.75 |
| 1 | 165035 | 15782418 | Nott | 683 | France | Female | 46.0 | 2 | 0.00 | 1 | 1.0 | 0.0 | 72549.27 |
| 2 | 165036 | 15807120 | K? | 656 | France | Female | 34.0 | 7 | 0.00 | 2 | 1.0 | 0.0 | 138882.09 |
| 3 | 165037 | 15808905 | O'Donnell | 681 | France | Male | 36.0 | 8 | 0.00 | 1 | 1.0 | 0.0 | 113931.57 |
| 4 | 165038 | 15607314 | Higgins | 752 | Germany | Male | 38.0 | 10 | 121263.62 | 1 | 1.0 | 0.0 | 139431.00 |
- number of samples/rows and features/columns
In [35]:
# Row/column counts for both splits; the test split has one fewer column
# because it lacks the Exited target.
num_train_rows, num_train_columns = train_df.shape
num_test_rows, num_test_columns = test_df.shape

for split_name, n_rows, n_cols in (
    ("Training", num_train_rows, num_train_columns),
    ("Test", num_test_rows, num_test_columns),
):
    print(f"{split_name} Data:")
    print(f"Number of Rows: {n_rows}")
    print(f"Number of Columns: {n_cols}\n")
Training Data: Number of Rows: 165034 Number of Columns: 14 Test Data: Number of Rows: 110023 Number of Columns: 13
In [36]:
# Column dtypes, non-null counts, and memory footprint of the training frame.
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 165034 entries, 0 to 165033 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 165034 non-null int64 1 CustomerId 165034 non-null int64 2 Surname 165034 non-null object 3 CreditScore 165034 non-null int64 4 Geography 165034 non-null object 5 Gender 165034 non-null object 6 Age 165034 non-null float64 7 Tenure 165034 non-null int64 8 Balance 165034 non-null float64 9 NumOfProducts 165034 non-null int64 10 HasCrCard 165034 non-null float64 11 IsActiveMember 165034 non-null float64 12 EstimatedSalary 165034 non-null float64 13 Exited 165034 non-null int64 dtypes: float64(5), int64(6), object(3) memory usage: 17.6+ MB
In [37]:
# Transposed numeric summary of the train split, shaded with a colour
# gradient so extremes are easy to spot (.transpose() == .T).
train_df.describe().transpose().style.background_gradient()
Out[37]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| id | 165034.000000 | 82516.500000 | 47641.356500 | 0.000000 | 41258.250000 | 82516.500000 | 123774.750000 | 165033.000000 |
| CustomerId | 165034.000000 | 15692005.019026 | 71397.816791 | 15565701.000000 | 15633141.000000 | 15690169.000000 | 15756824.000000 | 15815690.000000 |
| CreditScore | 165034.000000 | 656.454373 | 80.103340 | 350.000000 | 597.000000 | 659.000000 | 710.000000 | 850.000000 |
| Age | 165034.000000 | 38.125888 | 8.867205 | 18.000000 | 32.000000 | 37.000000 | 42.000000 | 92.000000 |
| Tenure | 165034.000000 | 5.020353 | 2.806159 | 0.000000 | 3.000000 | 5.000000 | 7.000000 | 10.000000 |
| Balance | 165034.000000 | 55478.086689 | 62817.663278 | 0.000000 | 0.000000 | 0.000000 | 119939.517500 | 250898.090000 |
| NumOfProducts | 165034.000000 | 1.554455 | 0.547154 | 1.000000 | 1.000000 | 2.000000 | 2.000000 | 4.000000 |
| HasCrCard | 165034.000000 | 0.753954 | 0.430707 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| IsActiveMember | 165034.000000 | 0.497770 | 0.499997 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| EstimatedSalary | 165034.000000 | 112574.822734 | 50292.865585 | 11.580000 | 74637.570000 | 117948.000000 | 155152.467500 | 199992.480000 |
| Exited | 165034.000000 | 0.211599 | 0.408443 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
In [38]:
# Transposed numeric summary of the test split, shaded with a colour
# gradient so extremes are easy to spot (.transpose() == .T).
test_df.describe().transpose().style.background_gradient()
Out[38]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| id | 110023.000000 | 220045.000000 | 31761.048671 | 165034.000000 | 192539.500000 | 220045.000000 | 247550.500000 | 275056.000000 |
| CustomerId | 110023.000000 | 15692096.605101 | 71684.990992 | 15565701.000000 | 15632859.000000 | 15690175.000000 | 15756926.000000 | 15815690.000000 |
| CreditScore | 110023.000000 | 656.530789 | 80.315415 | 350.000000 | 597.000000 | 660.000000 | 710.000000 | 850.000000 |
| Age | 110023.000000 | 38.122205 | 8.861550 | 18.000000 | 32.000000 | 37.000000 | 42.000000 | 92.000000 |
| Tenure | 110023.000000 | 4.996637 | 2.806148 | 0.000000 | 3.000000 | 5.000000 | 7.000000 | 10.000000 |
| Balance | 110023.000000 | 55333.611354 | 62788.519675 | 0.000000 | 0.000000 | 0.000000 | 120145.605000 | 250898.090000 |
| NumOfProducts | 110023.000000 | 1.553321 | 0.544714 | 1.000000 | 1.000000 | 2.000000 | 2.000000 | 4.000000 |
| HasCrCard | 110023.000000 | 0.753043 | 0.431244 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| IsActiveMember | 110023.000000 | 0.495233 | 0.499980 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| EstimatedSalary | 110023.000000 | 112315.147765 | 50277.048244 | 11.580000 | 74440.325000 | 117832.230000 | 154631.350000 | 199992.480000 |
Data Cleaning
Data Cleaning
2. Data Cleaning¶
- Null values
In [39]:
# Null Values in Train
train_null = train_df.isnull().sum().sum()
#Null Count in Test
test_null = test_df.isnull().sum().sum()
print(f'Null Count in Train: {train_null}')
print(f'Null Count in Test: {test_null}')
Null Count in Train: 0 Null Count in Test: 0
As we can see, there are no null values in either the train or test data.
- Duplicates Values
In [40]:
# Count duplicate rows in train_data
train_duplicates = train_df.duplicated().sum()
# Count duplicate rows in test_data
test_duplicates = test_df.duplicated().sum()
# Print the results
print(f"Number of duplicate rows in train_data: {train_duplicates}")
print(f"Number of duplicate rows in test_data: {test_duplicates}")
Number of duplicate rows in train_data: 0 Number of duplicate rows in test_data: 0
As we can see, there are no duplicate rows in either the train or test data.
Observation
- There are 165034 rows and 14 columns in the dataset.
- There are 5 columns with float dtype, 6 with integer dtype, and 3 with object dtype
- The data does not contain any missing values
- The target variable in the dataset is Exited.
In [86]:
# Numerical features to inspect for spread and outliers.
num_cols = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']

# Grid width for the subplot layout; rows grow with the number of columns,
# so the cell keeps working if features are added to num_cols (the original
# hard-coded a 2x2 mapping via conditionals, which broke past 4 features).
GRID_COLS = 2
grid_rows = -(-len(num_cols) // GRID_COLS)  # ceiling division

fig = make_subplots(rows=grid_rows, cols=GRID_COLS)

# One horizontal boxplot per feature; divmod maps the flat index to a
# (row, col) cell in the grid (plotly subplot indices are 1-based).
for idx, col in enumerate(num_cols):
    row_idx, col_idx = divmod(idx, GRID_COLS)
    fig.add_trace(
        go.Box(
            x=train_df[col],
            name=col,
        ),
        row=row_idx + 1,
        col=col_idx + 1,
    )

fig.update_layout(
    title_text="Boxplots of Numerical Columns",
    showlegend=False
)
fig.show()